In tobiasgf/lulu: Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

output: word_document: default pdf_document: default html_document: default

setwd("~/analyses")
main_path <- getwd()
path <- file.path(main_path, "otutables_processing")
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)

allFiles <- list.files(path)
all_plTabs <- allFiles[grepl("planttable$", allFiles)]
all_prTabs <- allFiles[grepl("planttable.luluprocessed$", allFiles)]
all_Tabs <-  c(all_plTabs,all_prTabs)
read_tabs <- file.path(path, all_Tabs)

# Vector for filtering, etc. at this step redundant, but included for safety
samples <- c("S001","S002","S003","S004","S005","S006","S007","S008","S067",
             "S009","S010","S011","S012","S013","S014","S040","S068","S015",
             "S016","S017","S018","S069","S070","S019","S020","S021","S022",
             "S024","S025","S026","S027","S041","S028","S029","S030","S032",
             "S033","S034","S035","S042","S036","S037","S038","S039","S086",
             "S087","S088","S089","S044","S071","S045","S046","S047","S048",
             "S049","S050","S051","S052","S053","S055","S056","S057","S058",
             "S090","S059","S060","S061","S062","S063","S064","S065","S066",
             "S072","S073","S074","S075","S076","S077","S078","S091","S079",
             "S080","S081","S082","S083","S084","S085","S092","S094","S095",
             "S096","S097","S098","S099","S100","S101","S102","S103","S104",
             "S106","S107","S108","S109","S133","S110","S111","S112","S113",
             "S114","S115","S116","S117","S118","S119","S120","S121","S122",
             "S123","S124","S134","S125","S126","S127","S129","S130","S131",
             "S132","S135","S136","S137")  

tab_name <- file.path(main_path,"Table_otu_taxonomy_plant_levels.txt")
otutaxonomy <- read.table(tab_name, sep="\t", header=TRUE, as.is=TRUE)

read_centr <- file.path(path, "otus_derep3")
allcentroids <- read.csv(read_centr,sep='\t',header=F,as.is=TRUE)
otusID <- seq(1,length(allcentroids$V1),2)
seqsID <- seq(2,length(allcentroids$V1),2)
otus <- allcentroids[otusID,]
seqs <- allcentroids[seqsID,]
otus <- gsub(">","",otus)

centroid_df <- data.frame(centroid = otus, sequence = seqs)

investigated_taxa <- c("Acer","Alnus","Avenella","Calamagrostis",
                       "Calluna","Centaurea","Cerastium","Erica","Fagus",
                       "Filipendula","Holcus","Littorella","Lysimachia","Menyanthes",
                       "Plantago","Poa")
ampmatrices <- list()
for (tt in 1:length(investigated_taxa)){
 reftaxon <- which(otutaxonomy$genus == investigated_taxa[tt])
 taxinfo <- otutaxonomy[reftaxon,c("pident","species")]
 rownames_amp <- otutaxonomy$qseqid[reftaxon]
 amp_matrix <- matrix(0, nrow = length(rownames_amp), ncol = length(all_Tabs), dimnames = list(rownames_amp,all_Tabs))
   for(i in seq_along(read_tabs)) {
   tab <- read.csv(read_tabs[i],sep='\t',header=T,as.is=TRUE,row.names = 1)
   tab <- tab[,samples] # order samples
   amp_index <- row.names(tab) #OTU names of current table
   ingroup_amps <- amp_index[which(amp_index %in% rownames_amp)]
   amp_readcounts <- rowSums(tab[ingroup_amps,]) 
   amp_matrix[ingroup_amps,i] <- amp_readcounts
   }
 amp_matrix <- cbind(amp_matrix,taxinfo)
 ampmatrices[[tt]] <- amp_matrix
}

taxon_plot <- list()
for (tt in c(1:length(investigated_taxa))){
 matr <- as.data.frame(ampmatrices[[tt]])

 gathered_matr <- gather(matr,method,abundance,1:40)

 filtered_matr <- filter(gathered_matr,abundance>0)

 method <- str_split_fixed(filtered_matr$method, "_", 3)[,1]
 method[method == "DADA2"] <- "DADA2(+VS)"
 method[method == "DADA2VSEARCH"] <- "DADA2(+VS)"
 level <- str_split_fixed(filtered_matr$method, "_", 3)[,2]
 level <- gsub(".planttable","",level)
 level[level == "0.95"] <- "95"
 level[level == "0.96"] <- "96"
 level[level == "0.97"] <- "97"
 level[level == "0.98"] <- "98"
 level[level == "0.985"] <- "98.5"
 level[level == "NO"] <- "99/100"
 level[level == "3"] <- "99/100"
 level[level == "5"] <- "98.5"
 level[level == "7"] <- "98"
 level[level == "10"] <- "97"
 level[level == "13"] <- "96"
 level[level == "15"] <- "95"
 level <- factor(level,levels = c("99/100", "98.5", "98", "97", "96", "95"))

 processed <- str_split_fixed(filtered_matr$method, "_", 3)[,3]
 luluindex <- which(processed == "luluprocessed")
 processed[luluindex] <- "curated"
 processed[-luluindex] <- "raw"

 filtered_matr$species2 <- paste0(str_split_fixed(filtered_matr$species, "_",2)[,1]," ",str_split_fixed(filtered_matr$species, "_",3)[,2]," ",str_split_fixed(filtered_matr$species, "_",4)[,3])

 filtered_matr_processed <- 
  data.frame(Method=method,Level=level,OTUtable=processed, pident=filtered_matr$pident, Annotation=filtered_matr$species2,abundance=filtered_matr$abundance)

 filtered_matr_processed$Annotation <- factor(filtered_matr_processed$Annotation)

 taxon_plot[[tt]] <- ggplot(filtered_matr_processed,aes(pident/100,abundance,col=OTUtable, shape=Annotation)) + geom_point(size=0.5, alpha = 0.8) + facet_grid(Level~Method) + scale_y_log10(labels = scales::comma) + scale_x_reverse(lim=c(1,0.70),labels = scales::percent) + scale_color_brewer(palette = "Set1") + theme_bw() + theme(text = element_text(size=8)) + xlab("Best match in reference data") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + theme(legend.text = element_text(size = 6)) + theme(legend.key = element_rect(size = 0.1), legend.key.size = unit(0.7, 'lines')) + guides(colour = guide_legend(order = 1), shape = guide_legend(order = 2))

}

taxon_plot[[1]] + scale_shape_manual(values=1:25)

Figure 17. Curation effect on OTUs assigned to Acer.
Three species of Acer (A. campestre, A. pseudoplatanus and A. platanoides) were recorded in the plant survey. Curation resulted in accurate diversity measures (except for CROP) and the annotation was also correct for two of the species. The third OTU imperfectly assigned to Acer mandshuricum probably indicates that no perfect match for the third species (A. platanoides) was found in the reference database. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[2]] + scale_shape_manual(values=1:25)

Figure 18. Curation effect on OTUs assigned to Alnus.
One species of Alnus (A. glutinosa) was recorded in the plant survey. Curation resulted in accurate diversity measures and correct annotation, except for CROP, where only the 98% approach correlated with expectations from the survey and the other methods. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[3]] + scale_shape_manual(values=1:25)

Figure 19. Curation effect on OTUs assigned to Avenella.
The single species of the monotypic genus Avenella (A. flexuosa) was recorded in the plant survey. Curation resulted in accurate diversity measures and correct annotation, except for 'DADA2-100%' and 'CROP-95%' both finding too many OTUs. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[4]] + scale_shape_manual(values=1:25)

Figure 20. Curation effect on OTUs assigned to Calamagrostis.
Two species of Calamagrostis (C. canescens and C. epigeios) were recorded in the plant survey. Curation resulted in accurate diversity measures and correct annotation, except for extreme clustering levels and for CROP. In most approaches, far too many OTUs and species names were identified in the un-curated data. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[5]] + scale_shape_manual(values=1:25)

Figure 21. Curation effect on OTUs assigned to Calluna.
The single species of the monotypic genus Calluna (C. vulgaris) was recorded in the plant survey. Curation resulted in accurate diversity measures and correct annotation, except for 'DADA2-100%', 'CROP-95%' and 'CROP-98%'. In most approaches far too many OTUs were identified in the un-curated data, although they all received the same annotation. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[6]] + scale_shape_manual(values=1:25)

Figure 22. Curation effect on OTUs assigned to Centaurea.
Three species of Centaurea (C. cyanus, C. jacea and C. scabiosa) were recorded in the plant survey. CROP did only identify one species in one approach (98%). Despite suboptimal annotation (only one name match, C. scabiosa), the richness estimates were improved by curation. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[7]] + scale_shape_manual(values=1:25)

Figure 23. Curation effect on OTUs assigned to Cerastium.
Two taxa of Cerastium (C. fontanum ssp. vulgare var. vulgare and C. semidecandrum) were recorded in the plant survey. 'CROP 98%' performed better that the two other clustering levels for that method. Curation resulted in better richness estimates for the other approaches, despite deviations in the names between plant survey and OTU for one species. Seemingly, initial clustering levels of 95%, 96% and 97% were too restrictive, as one of the supposed ‘good’ OTUs was lost. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[8]] + scale_shape_manual(values=1:25)

Figure 24. Curation effect on OTUs assigned to Erica.
One species (E. tetralix) was recorded in the plant survey. CROP retained suboptimal OTUs with a different abundance and/or number than the other methods. Curation resulted in accurate richness and annotation for the other methods. DADA2+VSEARCH performed accurately without curation for clustering levels 97%, 96% and 95%. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[9]] + scale_shape_manual(values=1:25)

Figure 25. Curation effect on OTUs assigned to Fagus.
One species (F. sylvatica) was recorded in the plant survey. CROP initially found the most realistic number of OTUs, despite selecting a suboptimal sequence in the 97% setting, and one OTU impervious to curation in the 98% setting. The single OTU with a 100% match with a read abundance of around 10 (in VSEARCH, DADA2 and SWARM), impervious to curation, was in fact a fungal sequence mathcing a reference in GenBank erroneously annotated as Fagus. As this fungal sequence had distribution and abundance pattern contrasting that of Fagus, it was not discarded. Curation made the richness significantly more realistic for all approaches, although the SWARM method retained several redundant OTUs after duration. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[10]] + scale_shape_manual(values=1:25)

Figure 26. Curation effect on OTUs assigned to Filipendula.
Two species of Filipendula (F. ulmaria and F. vulgaris) were recorded in the plant survey. CROP identified too few and suboptimal OTUs. Curation resulted in accurate taxonomic composition and richness for the other methods, which all identified too many OTUs at all levels. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[11]] + scale_shape_manual(values=1:25)

Figure 27. Curation effect on OTUs assigned to Holcus.
Curation effect on all OTUs assigned to the genus Holcus. Two species (H. lanatus and H. mollis) were recorded in the plant survey. CROP identified too few and suboptimal OTUs. Curation resulted in accurate richness estimates for the other methods at clustering levels of 97% and upwards. Both retained OTUs were assigned to H. lanatus, one perfectly and the other with lower match, probably due to H. mollis not being represented in the reference database. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[12]] + scale_shape_manual(values=1:25)

Figure 28. Curation effect on OTUs assigned to Littorella.
Curation effect on all OTUs assigned to the genus Littorella, which is sometimes considered a part of Plantago (see below for data on Plantago). One species (L. uniflora) is known from the study area and was recorded in the plant survey. All approaches identified the correctly annotated species, but only 'CROP-95%' did not identify any extra OTUs. The other approaches initially identified too many OTUs at all clustering levels. Curation resulted in accurate richness for all approaches. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[13]] + scale_shape_manual(values=1:25)

Figure 29. Curation effect on OTUs assigned to Lysimachia.
Two species (L. thyrsiflora and L. vulgaris) were recorded in the plant survey. Except for CROP, curation resulted in accurate taxonomic composition and richness for all methods, which all initially identified too many OTUs at all levels. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[14]] + scale_shape_manual(values=1:25)

Figure 30. Curation effect on OTUs assigned to Menyanthes.
The single species of the monotypic genus Menyanthes (M. trifoliata) was recorded in the plant survey. All methods initially identified too many OTUs, despite correct annotation. Curation resulted in accurate richness for all methods except 'CROP-97%' with one OTU too much. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[15]] + scale_shape_manual(values=1:25)

Figure 31. Curation effect on OTUs assigned to Plantago.
Three species (P. lanceolata, P. major and P. maritima) were recorded in the plant survey (for data on Littorella uniflora (=P. uniflora) see above). CROP selected suboptimal OTUs and/or too few. Curation resulted in accurate richness and annotation for all other methods. DADA2-95% and VSEACRH-95% performed accurate without curation also. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

taxon_plot[[16]] + scale_shape_manual(values=1:25)

Figure 32. Curation effect on OTUs assigned to Poa.
Curation effect on all OTUs assigned to the genus Poa. Five species of Poa (P. annua, P. compressa, P. nemoralis, P. pratensis and P. trivialis) were recorded in the plant survey. Grasses often pose problems in molecular identification. Here, curation resulted in a very good correspondence between what could be expected from the plant survey, despite richness being slightly underestimated (4 out of five 5 species) in most approaches and one or two ill-assigned species. Log abundance (total read count of OTU) is plotted on the y-axis. Best match on GenBank is plotted on the x-axis. Blue points represent OTUs discarded by LULU (i.e. OTUs found only in the initial, uncurated OTU tables), and red points represent OTUs retained by the LULU curation, shapes represent species annotations according to best reference database match (GenBank). The “99/100%” clustering level denotes the pure DADA2 approach (100%) and SWARM with a d-value of 3 (99%).

\pagebreak

tobiasgf/lulu documentation built on Jan. 17, 2024, 3:57 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

tobiasgf/lulu
Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

In tobiasgf/lulu: Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

R Package Documentation

Browse R Packages

We want your feedback!

tobiasgf/lulu Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

In tobiasgf/lulu: Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics

R Package Documentation

Browse R Packages

We want your feedback!

tobiasgf/lulu
Post Clustering Curation of Amplicon Data for Reliable Biodiversity Metrics